Statistics 141B: Data and Web Technologies
Contributors: Jeremy Weidner, Weizhou Wang, Audrey Chu, and Yuji Mori
To study NASDAQ stock prices for the technology, finance, health care, and energy industry sectors across time. With the application of python and utilization of the Yahoo! Finance API, Yahoo Query Language (YQL), and New York Times API, we will gather and clean out a dataset for a time period of ten years for approximately 1770 companies. Our data will incorporate the closing prices for each day and then average these prices for each respective sector. In analyzing the stock prices, we will use interactive data visualization as well as attempt to create a time series ARIMA (Autoregressive Integrated Moving Average) model.
import sys
import csv
import json
import requests
import requests_cache
import numpy as np
import pandas as pd
#from yahoo_finance import Share
from pprint import pprint
from datetime import datetime
import matplotlib.pyplot as plt
import math
import matplotlib as mpl
%matplotlib inline
import seaborn as sns
sns.set_style('white', {"xtick.major.size": 2, "ytick.major.size": 2})
flatui = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71","#f4cae4"]
sns.set_palette(sns.color_palette(flatui,7))
import missingno as msno
#this is for time series
from __future__ import print_function
from scipy import stats
import statsmodels.api as sm
from statsmodels.graphics.api import qqplot
#this is for plotting the prices
import plotly
plotly.tools.set_credentials_file(username="audchu",api_key="fPSZjEJ6wqYjzolZ8wNI")
import plotly.plotly as py
import plotly.graph_objs as go
from datetime import datetime
from pandas_datareader import data,wb
#this is for streaming plot
from plotly.grid_objs import Grid,Column
import time
# Accessing the NYT API
from nytimesarticle import articleAPI
# Webscraping and Text Processing
from bs4 import BeautifulSoup
import urllib2
from urllib2 import urlopen
import string
import nltk
import regex as re
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import wordcloud
import pickle
requests_cache.install_cache('cache')
# Yahoo! YQL API
PUBLIC_API_URL = 'https://query.yahooapis.com/v1/public/yql'
OAUTH_API_URL = 'https://query.yahooapis.com/v1/yql'
DATATABLES_URL = 'store://datatables.org/alltableswithkeys'
def myreq(ticker, start, end):
'''
input ticker & dates as strings form 'YYYY-MM-DD'
'''
params = {'format':'json',
'env':DATATABLES_URL}
query = 'select * from yahoo.finance.historicaldata where symbol = "{}" and startDate = "{}" and endDate = "{}"'.format(ticker,start, end)
params.update({'q':query})
req = requests.get(PUBLIC_API_URL, params=params)
req.raise_for_status()
req = req.json()
if req['query']['count'] > 0:
result = req['query']['results']['quote']
return result
else:
pass
def price2(ticker):
"""
Return closing prices for stocks from years 2006 to 2016.
"""
date=[]
price=[]
report = []
years = [2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016]
for x in range(len(years)):
c = myreq(ticker,'{}-01-01'.format(years[x]),'{}-12-31'.format(years[x]))
try:
for i in range(0,len(c)):
date.append(pd.to_datetime(c[i]["Date"]))
price.append(float(c[i][u'Close']))
datef = pd.DataFrame(date)
pricef = pd.DataFrame(price)
table1 = pd.concat([datef,pricef],axis = 1)
table1.columns = ['Date', ticker]
table1 = table1.set_index("Date")
except Exception:
table1 = pd.DataFrame()
return table1
We begin by extracting our main dataset which comes from the Yahoo Finance API. We extracted stock closing prices for each stock currently in the Nasdaq. We do this via requests using a YQL query to the historical data table of the API. Not every stock currently in the Nasdaq was around from when we started extracting in 2006 and so they're values are NA until they IPO. The weights of these additions when they happen are not believed to be of substantial impact to our analysis of price because we aggregate over 4 sectors across roughly 1700 stocks. If you look closely at the function it iterates by year because if you request over 364 values there is an undocumented error where you get returned an empty JSON object.
csv = pd.read_csv('./companylist.csv')
# We want to keep "Finance, Health Care, Technology, Energy"
newcsv = csv[csv["Sector"].isin(["Finance", "Energy","Health Care","Technology"])].reset_index()
del newcsv["index"]
whole_list = newcsv['Symbol']
'''
for l in whole_list:
get = price2(l)
try:
df = pd.concat([df,get],axis = 1) # concat. by column
except NameError:
df = pd.DataFrame(get) # initialize automatically
# SAVE THE RESULT LOCALLY:
df.to_pickle('mydf')
'''
df = pd.read_pickle('mydf')
# This allows us to control size of a dataframe displayed to examine our data in depth
pd.options.display.max_columns = 20
pd.options.display.max_rows = 30
df.head()
final = newcsv.reset_index()
df_long = df.transpose()
sector = final[['Symbol','Sector']]
sector = sector.set_index('Symbol')
final = df_long.join(sector)
final.head()
hc = final[final['Sector'] == 'Health Care']
# take median within groups for each recorded date:
avg_sector = final.groupby('Sector').median().reset_index('Sector')
avg_sector = avg_sector.set_index('Sector')
avg_sector = avg_sector.dropna(thresh=4, axis = 1) # this drops if a column does not have at least two non NA's
# Dates as index for plotting
# This is basically the original DF (transposed and transposed back)
# but the columns are now the Sector averages.
avg_T = avg_sector.transpose()
avg_T.head()
def ts_slider(sector,sec_name):
trace = go.Scatter(x=avg_T.index,y=sector)
data = [trace]
layout = dict(
title=sec_name + ' Sector Closing Prices: Time series with Range Slider',
xaxis=dict(
rangeselector=dict(
buttons=list([
dict(count=1,
label='1m',
step='month',
stepmode='backward'),
dict(count=6,
label='6m',
step='month',
stepmode='backward'),
dict(count=1,
label='YTD',
step='year',
stepmode='todate'),
dict(count=1,
label='1y',
step='year',
stepmode='backward'),
dict(step='all')
])
),
rangeslider=dict(),
type='date'
)
)
fig = dict(data=data, layout=layout)
return py.iplot(fig)
ts_slider(avg_T.Energy,"Energy")
# A new DF for the difference between each day:
delta_df = pd.DataFrame()
for sect in avg_T.columns:
delta_df[sect] = np.log(avg_T[sect].shift(1)) - np.log(avg_T[sect])
delta_df.columns = map(lambda name: '{} Changes'.format(name),avg_T.columns)
# On what day did the stock price spike the most?
abs(delta_df).idxmax()
shade = delta_df['Energy Changes']
delta_df.head()
plot_cols = list(delta_df)
# 2 axes for 2 subplots
fig, axes = plt.subplots(4,1, figsize=(10,7), sharex=True)
#delta_df[plot_cols].plot(subplots=True, ax=axes)
delta_df[plot_cols].plot(subplots=True, ax=axes)
#plt.ylim([-0.20,0.150])
plt.show()
avg_T.index= pd.to_datetime(avg_T.index)
avg_month = avg_T.groupby([avg_T.index.year, avg_T.index.month]).mean()
avg_month.head()
plot_cols
# A new DF for the difference between each month's average:
delta_avg = pd.DataFrame()
for sect in avg_month.columns:
delta_avg[sect] = np.log(avg_month[sect].shift(1)) - np.log(avg_month[sect])
delta_avg.columns = map(lambda name: '{} Changes'.format(name),avg_month.columns)
col = []
for i in range(len(delta_avg)):
dt = delta_avg.index[i] + (10, 10, 10, 10)
dt_obj = datetime(*dt[0:6])
col.append(pd.to_datetime(dt_obj))
delta_avg['Timestamp'] = col
delta_avg = delta_avg.set_index('Timestamp')
delta_avg
plot_cols = list(delta_avg)
# 2 axes for 2 subplots
fig, axes = plt.subplots(4,1, figsize=(10,7), sharex=True)
#delta_df[plot_cols].plot(subplots=True, ax=axes)
delta_avg[plot_cols].plot(subplots=True, ax=axes)
#plt.ylim([-0.20,0.150])
plt.show()
(abs(delta_avg).iloc[1:,:].quantile(q=0.90, axis=0))
peak = delta_avg[(abs(delta_avg) >= 0.163191).any(axis=1)]
peak.head()
plot_cols = list(delta_avg)
fig, axes = plt.subplots(4,1, figsize=(10,7), sharex=True)
delta_avg[plot_cols].plot(subplots=True, ax=axes)
for shade in range(len(peak)):
peak_bgn = peak.index[shade]
if peak.index[shade].month == 12:
year = peak.index[shade].year+1
end = (year, 1, 10, 10, 10, 10)
dt_obj = datetime(*end[0:6])
peak_end = pd.to_datetime(dt_obj)
else:
mo = peak.index[shade].month + 1
end = (peak.index[shade].year, mo, 10, 10, 10, 10)
dt_obj = datetime(*end[0:6])
peak_end = pd.to_datetime(dt_obj)
for ax in axes:
ax.axvspan(peak_bgn, peak_end, color=sns.xkcd_rgb['grey'], alpha=.5)
ax.axhline(0, color='k', linestyle='-', linewidth=1)
plot_cols = list(avg_T)
fig, axes = plt.subplots(4,1, figsize=(10,7), sharex=True)
avg_T[plot_cols].plot(subplots=True, ax=axes)
for shade in range(len(peak)):
peak_bgn = peak.index[shade]
if peak.index[shade].month == 12:
year = peak.index[shade].year+1
end = (year, 1, 10, 10, 10, 10)
dt_obj = datetime(*end[0:6])
peak_end = pd.to_datetime(dt_obj)
else:
mo = peak.index[shade].month + 1
end = (peak.index[shade].year, mo, 10, 10, 10, 10)
dt_obj = datetime(*end[0:6])
peak_end = pd.to_datetime(dt_obj)
for ax in axes:
ax.axvspan(peak_bgn, peak_end, color=sns.xkcd_rgb['grey'], alpha=.5)
ax.axhline(0, color='k', linestyle='-', linewidth=1)
peak.index[0]
peak.index[1].month - peak.index[0].month == 0
peak_shade = []
for l in range(len(peak)-1):
if peak.index[l+1].year == peak.index[1].year:
if peak.index[l+1].month - peak.index[1].month == 1:
start = peak.index[l]
else:
start = peak.index[l-1]
peak_shade.append(start)
else:
start = peak.index[l-1]
peak_shade.append(start)
peak_shade
peak
# double check, would it make more sense to look at sum of absolute changes?
# This wouldn't give us direction though.
abs_delta_df = abs(delta_df)
months = abs_delta_df.groupby(delta_df.index.month).sum()
months
months["total"] = months.sum(axis=1)
months.head()
sns.set_style("white")
sns.set_context({"figure.figsize": (20, 10)})
sns.barplot(x = months.index, y = months.total, color = "red")
health_plot = sns.barplot(x = months.index, y = months['Health Care Changes']+months['Energy Changes']+months['Finance Changes'], color = "yellow")
fin_plot = sns.barplot(x = months.index, y = months['Finance Changes']+months['Energy Changes'], color = "blue")
eng_plot = sns.barplot(x = months.index, y = months['Energy Changes'], color = "green")
topbar = plt.Rectangle((0,0),1,1,fc="red", edgecolor = 'none')
bottombar = plt.Rectangle((0,0),1,1,fc='#0000A3', edgecolor = 'none')
l = plt.legend([bottombar, topbar], ['Energy Changes', 'Finance Changes'], loc=1, ncol = 2, prop={'size':16})
l.draw_frame(False)
#Optional code - Make plot look nicer
sns.despine(left=True)
eng_plot.set_ylabel("Total Absolute Closing Price Changs")
eng_plot.set_xlabel("Months")
#Set fonts to consistent 16pt size
for item in ([bottom_plot.xaxis.label, bottom_plot.yaxis.label] +
bottom_plot.get_xticklabels() + bottom_plot.get_yticklabels()):
item.set_fontsize(16)
# 261 not right number, why is our last day 12-30-2016? -261 maybe?
year = delta_df.iloc[-100:,:]
plot_cols = list(year)
# 2 axes for 2 subplots
fig, axes = plt.subplots(4,1, figsize=(10,7), sharex=True)
#delta_df[plot_cols].plot(subplots=True, ax=axes)
year[plot_cols].plot(subplots=True, ax=axes)
#plt.ylim([-0.20,0.150])
plt.show()
The New York Times Article Search API is capable of extracting decades worth of news records. We utilized the API in order to extract over 9,000 relevant articles, filtered by the four designated sectors. To visualize the data, we have developed an interactive plot.ly timeline that allows users to navigate a 10-year period's worth of news headlines.
# OLD KEY (hit my limit): api = articleAPI('2679a66fe8df4740b754f98e52ad068c')
api = articleAPI('e031fcaf03da4b3c949e505c4aa69a5b')
def news_articles(sector,start,end,pages):
sector_df = pd.DataFrame()
for i in range(pages):
try:
if sector == 'Health Care':
sector_articles = api.search(
q = 'Health',
fq = {
'news_desk':'Business',
'section_name':'Business',
'subject.contains':['Health','Drugs'],
'type_of_material':'News'
},
begin_date = start,
end_date = end,
sort = 'oldest',
page = i
)
if sector == 'Technology':
sector_articles = api.search(
fq = {
'news_desk.contains':'Business',
'section_name':'Technology',
'subject.contains':['Acquisitions'],
'type_of_material':'News'
},
begin_date = start,
end_date = end,
sort = 'oldest',
page = i
)
if sector == 'Energy':
sector_articles = api.search(
q = 'Energy & Environment',
fq = {
'news_desk':'Business',
'subject.contains':['Energy','Renewable','Gas','Acquisitions'],
'section_name':'Business',
'type_of_material':'News'
},
begin_date = start,
end_date = end,
sort = 'oldest',
page = i
)
if sector == 'Finance':
sector_articles = api.search(
q = 'Finance',
fq = {
'news_desk':'Business',
'subject.contains':['Banking','Financial'],
'section_name':'Business',
'type_of_material':'News'
},
begin_date = start,
end_date = end,
sort = 'oldest',
page = i
)
df_i = sector_articles['response']['docs']
sector_df = sector_df.append(df_i)
time.sleep(0.5) # API only allows 5 calls per second. This slows it down!
except KeyError:
break
except IndexError:
break
return sector_df.reset_index()
Asking for over 100 pages at once (necessary for a 10-year span) leads to unpredictable results.
news_articles(<sector>,20060101,20170101,500), but the function stops running around 110~120 pages. This is likely due to usage restrictions imposed by the API.'''
The following code locally stores the news data. Please do not run this block!
healthcare_news1 = news_articles('Health Care',20060101,20091231,100)
healthcare_news2 = news_articles('Health Care',20100101,20131231,100)
healthcare_news3 = news_articles('Health Care',20140101,20161231,100)
healthcare_news = pd.concat([healthcare_news1,healthcare_news2,healthcare_news3],ignore_index=True)
healthcare_news['Sector'] = 'Health Care'
tech_news = news_articles('Technology',100)
tech_news['Sector'] = 'Technology'
energy_news1 = news_articles('Energy',20060101,20091231,100)
energy_news2 = news_articles('Energy',20100101,20131231,100)
energy_news3 = news_articles('Energy',20140101,20161231,100)
energy_news = pd.concat([energy_news1,energy_news2,energy_news3])
energy_news['Sector']='Energy'
finance_news1 = news_articles('Finance',20060101,20091231,100)
finance_news2 = news_articles('Finance',20100101,20131231,100)
finance_news3 = news_articles('Finance',20140101,20161231,100)
finance_news = pd.concat([finance_news1,finance_news2,finance_news3])
finance_news['Sector']='Finance'
all_news = pd.concat([healthcare_news,tech_news,energy_news,finance_news],ignore_index=True)
# all_news.to_pickle('news_df')
'''
all_news = pd.read_pickle('news_df')
# all_news = all_news.reset_index()
# Convert dates to index
all_news['pub_date'] = pd.to_datetime(all_news.pub_date)
all_news = all_news.set_index(all_news['pub_date'])
headlines = [d.get('main') for d in all_news.headline]
dates = list(pd.to_datetime(all_news['pub_date']))
# Pandas categorical objects have integer values mapped to them:
sector_levels = all_news['Sector'].astype('category').cat.codes
# Data Frame for plot.ly Timeline:
pltdf = pd.DataFrame(
{'Date':dates,
'Title':headlines,
'Sector':all_news.Sector,
'Level':sector_levels
})
import plotly.plotly as py
import plotly.graph_objs as go
fig = {
'data': [
{
'x': pltdf[pltdf['Sector']==sector]['Date'],
'y': pltdf[pltdf['Sector']==sector]['Level'],
'text': pltdf[pltdf['Sector']==sector]['Title'],
'name': sector, 'mode': 'markers',
} for sector in reversed(all_news['Sector'].astype('category').cat.categories)
]
}
py.iplot(fig, filename='plotly_test')
The Timeline
The articles found in this tool are primarly from the Business category of the NYT API to ensure that the majority of the headlines have a high relevance to the stock market indicies we analyzed. At the same time, the news encompasses a wide array of contexts (legislation, judicial processes, company mergers and acquisitions, international events, R&D, and others). Even company-specifc headlines usually target the largest corporartions within their respective industries.
Though the timeline is a cool concept, its scope is too wide to make any substantial conclusions. To further our analysis on average stock prices, we return to the volatility plots, specifically focusing on the shaded regions of highest volatiliy. From these time periods, we analyzed the text from snippets of the articles to calculate word frequencies. The top 100 words from each sector were selected and displayed on a word cloud.
We have visualized two periods where the volatility persisted for an exceptional length:
tokenize = nltk.word_tokenize
def stem(tokens,stemmer = PorterStemmer().stem):
return [stemmer(w.lower()) for w in tokens]
# http://stackoverflow.com/questions/11066400/remove-punctuation-from-unicode-formatted-strings
def remove_punctuation(text):
return re.sub(ur"\p{P}+", "", text)
# Extract text from a NYT news article URL:
def url_text(url):
#doc = urlopen(url)
# Must override HTTP redirects...
# SOURCE: http://stackoverflow.com/questions/9926023/handling-rss-redirects-with-python-urllib2
doc = urllib2.build_opener(urllib2.HTTPCookieProcessor).open(url)
soup = BeautifulSoup(doc, 'html.parser')
tags = soup.body.find_all('p',{"class":"story-body-text story-content"})
corpus = []
for tag in tags:
corpus.append(tag.text.strip())
corpus =' '.join(corpus)
return(corpus)
# Create a corpus which contains text from all articles in the sector
def sector_corpus(data):
'''
requires dataframe input (a subset by date of all_news).
Outputs one large string from every URL found in in the dataframe.
'''
links = data.web_url
full_sector = []
for link in links:
try:
full_article = url_text(link)
except:
full_article = ''
full_sector.append(full_article)
return(' '.join(full_sector))
# Process the text and return a dictionary of term:frequency
def BetterClouds(data):
'''
Applies the sector_corpus() function and returns the top 100 words from said corpus.
'''
text = sector_corpus(data)
text = text.lower()
text = remove_punctuation(text)
tokens = tokenize(text)
ignore = ['said','say','$','like','also','like','or','would']
filtered = [w for w in tokens if not w in stopwords.words('english')+ignore]
filtered = stem(filtered)
count = Counter(filtered)
# top 100 words:
top_words = dict(count.most_common(100))
return(top_words)
#cloud = wordcloud.WordCloud(background_color="white")
#wc = cloud.generate_from_frequencies(top_words)
#return(wc)
# (1) Subset the data:
sample1 = all_news['2008-8-10 10:10:10':'2008-11-10 10:10:10']
sample1_sector = sample1.groupby("Sector") # a groupby object
# (2) Subset the data:
sample2 = all_news['2015-12-10 10:10:10':'2016-03-10 10:10:10']
sample2_sector = sample2.groupby("Sector")
# List of dictionaries with term:frequency for each sector.
# NOTE: This takes an eternity to run (scraping and processing hundreds of pages).
# These lists have been saved locally as 'termfreq_2008' and 'termfreq_2016' respectively.
# ORIGINAL CODE:
# word_freq_list = [BetterClouds(sample1_sector.get_group(sect)) for sect in all_news.Sector.unique()]
# word_freq_list2 =[BetterClouds(sample2_sector.get_group(sect)) for sect in all_news.Sector.unique()]
# SAVE LISTS LOCALLY:
#with open('termfreq_2008', 'wb') as fp:
#pickle.dump(word_freq_list, fp)
#with open('termfreq_2016', 'wb') as fp:
#pickle.dump(word_freq_list2, fp)
# Load the already-processed results for quick access.
with open ('termfreq_2008', 'rb') as fp:
word_freq_list = pickle.load(fp)
with open ('termfreq_2016', 'rb') as fp:
word_freq_list2 = pickle.load(fp)
# Create frequency bar plots for the news articles found in Aug2008--Nov2008.
#sns.set_style("dark")
for counter,sector in enumerate(word_freq_list):
top10 = sorted(sector.items(), key=lambda kv: kv[1], reverse=True)[:10]
terms,frequency = zip(*top10)
current_sect = all_news.Sector.unique()[counter]
plt.figure(counter,figsize=(6,3))
freq_plot = sns.barplot(terms,frequency,palette='GnBu')
freq_plot.set_title("Top 10 %s Terms: Aug.2008 to Nov.2008"%current_sect)
for item in freq_plot.get_xticklabels():
item.set_rotation(45)
# Support the frequency plot with a word cloud.
fig = plt.figure(figsize=(20,10))
fig.suptitle("NYT Most Used Terms: Aug.2008 to Nov.2008", fontsize=30)
cloud = wordcloud.WordCloud(background_color="white")
for counter, sect in enumerate(word_freq_list):
current_sect = all_news.Sector.unique()[counter]
wc = cloud.generate_from_frequencies(sect)
ax = fig.add_subplot(2,2,counter+1)
ax.set_title("%s Sector" %current_sect,fontsize=20)
plt.imshow(wc,aspect='auto')
plt.axis("off")
Each sector has several key words that dominate their respective domains. Because these articles were originally collected using certain search parameters for the New York Times API, some terms (i.e. the sector names themselves) are unfortunately overrepresented. However, this design did not prevent other terms from taking the number one spot on the frequency plots. For example, "bank" tops the Financial plots with almost triple the frequency of the second most common term.
Notably, there are very few instances of term overlap. For example, it is clear that "compani" (assumably "company") is a commonly used word in articles from all sectors. Aside from this instance, it can be argued that the news articles from each sector are unique and distinguishable in their vocabulary.
# Create frequency bar plots for the news articles found in Dec2015--Mar2016.
for counter,sector in enumerate(word_freq_list2):
top10 = sorted(sector.items(), key=lambda kv: kv[1], reverse=True)[:10]
terms,frequency = zip(*top10)
current_sect = all_news.Sector.unique()[counter]
plt.figure(counter,figsize=(6,3))
freq_plot = sns.barplot(terms,frequency,palette='GnBu')
freq_plot.set_title("Top 10 %s Terms: Dec.2015 to Mar.2016"%current_sect)
for item in freq_plot.get_xticklabels():
item.set_rotation(45)
# Support the frequency plot with a word cloud.
fig = plt.figure(figsize=(20,10))
fig.suptitle("NYT Most Used Terms: Dec.2015 to Mar.2016", fontsize=30)
cloud = wordcloud.WordCloud(background_color="white")
for counter, sect in enumerate(word_freq_list2):
current_sect = all_news.Sector.unique()[counter]
wc = cloud.generate_from_frequencies(sect)
ax = fig.add_subplot(2,2,counter+1)
ax.set_title("%s Sector" %current_sect,fontsize=20)
plt.imshow(wc,aspect='auto')
plt.axis("off")
Though the most used terms remain more or less the same, a noticable change in vocabulary occurs in the Technology sector. For one, more companies are named explicity, with Yahoo and Grindr taking spots in the top 10 (on the word cloud, companies like Google and Apple make an appearance as well). In addition, the terms on this 2015-2016 word cloud seem to exclude "business terms" such as "share","deal",and "market", all of which were in the top 10 of the 2008 articles.
Yet for the other sectors, many of the major terms remain the same; "compani" remains in the first or second spot for Health Care, Tech, and Energy, and "bank" continues to dominate Finance. Thus, the terms that convey the most information likely exist somewhere in between the top 10 to 100 unigrams collected, as words like "game" and "esport" become more popular the Tech industry, and entities like China make their way into financial news. Overall, though the difference here is only a few years, the changes in term frequency may correspond to a shift in industry interests.
ts_eng = delta_df['Energy Changes']
# Why do we get an NA for Nov 1 2016?
# Need to change following date range
# ts_eng['2016-11-02':'2017-03-01']
Dickey-Fuller Test: tests the null hypothesis of whether a u nit root is present in an autoregressive model.
from statsmodels.tsa.stattools import adfuller
def test_stationarity(timeseries):
#Determing rolling statistics
rolmean = pd.rolling_mean(timeseries, window=12)
rolstd = pd.rolling_std(timeseries, window=12)
#Plot rolling statistics:
orig = plt.plot(timeseries, color='blue',label='Original')
mean = plt.plot(rolmean, color='red', label='Rolling Mean')
std = plt.plot(rolstd, color='black', label = 'Rolling Std')
plt.legend(loc='best')
plt.title('Rolling Mean & Standard Deviation')
plt.show(block=False)
#Perform Dickey-Fuller test:
print('Results of Dickey-Fuller Test:')
dftest = adfuller(timeseries, autolag='AIC')
dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
for key,value in dftest[4].items():
dfoutput['Critical Value (%s)'%key] = value
print(dfoutput)
test_stationarity(ts_eng)
testE = delta_df["Energy Changes"]
E_log = np.log(avg_T.iloc[-365:,:]['Health Care'])
E_log_diff = E_log - E_log.shift()
plt.plot(E_log_diff)
fig = plt.figure(figsize=(8,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(E_log_diff[1:].values.squeeze(),lags = 40,ax=ax1)
plt.ylim([-0.15,0.15])
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(E_log_diff[1:],lags =40, ax= ax2)
plt.ylim([-0.15,0.15])
E_log_diff.dropna(inplace=True)
test_stationarity(E_log_diff)